In this document we present the joint analysis of the PASS1A metabolomics datasets.
Load the data from the cloud, including: phenotypic data, metabolomic datasets, and metabolomics dictionary.
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/supervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/gcp_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/association_analysis_methods.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/data_aux_functions.R")
source("~/Desktop/repos/motrpac/tools/prediction_ml_tools.R")
library(randomForest) # for classification tests
library(ggcorrplot)
# Load the dmaqc data
# load_from_bucket (from gcp_functions.R) returns a list of loaded objects;
# we keep the first one. The third argument F presumably toggles caching /
# re-download — TODO confirm against gcp_functions.R.
merged_dmaqc_data = load_from_bucket("merged_dmaqc_data2019-10-15.RData",
"gs://bic_data_analysis/pass1a/pheno_dmaqc/",F)
merged_dmaqc_data = merged_dmaqc_data[[1]]
# Index the phenotype table by vial label so metabolomics sample columns
# (vial labels) can be matched by rowname lookups later in this document.
rownames(merged_dmaqc_data) = as.character(merged_dmaqc_data$vial_label)
# define the tissue variable
merged_dmaqc_data$tissue = merged_dmaqc_data$sampletypedescription
# define the time to freeze variable
# (minutes from death to collection plus collection to freeze)
merged_dmaqc_data$time_to_freeze = merged_dmaqc_data$calculated.variables.time_death_to_collect_min +
merged_dmaqc_data$calculated.variables.time_collect_to_freeze_min
# Add blood freeze times
# Plasma samples have their own timing fields: freeze time minus EDTA spin
# time (in minutes) replaces the tissue-based computation above.
blood_samples =
merged_dmaqc_data$specimen.processing.sampletypedescription ==
"EDTA Plasma"
blood_freeze_time =
as.difftime(merged_dmaqc_data$specimen.processing.t_freeze,units = "mins") -
as.difftime(merged_dmaqc_data$specimen.processing.t_edtaspin,units="mins")
blood_freeze_time = as.numeric(blood_freeze_time)
# NOTE(review): blood_samples may contain NA for rows with missing sample
# type; NA indices would propagate NAs here — confirm upstream completeness.
merged_dmaqc_data$time_to_freeze[blood_samples] =
blood_freeze_time[blood_samples]
# Load our parsed metabolomics datasets
# (per-site raw parses; a named list with $sample_data matrices)
download_obj = load_from_bucket(
file = "metabolomics_parsed_datasets_pass1a_external1.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
metabolomics_parsed_datasets =
download_obj$metabolomics_parsed_datasets
# Load the processed, merged, normalized datasets
# load_from_bucket returns a list of loaded objects; [[1]] keeps the first.
metabolomics_processed_datasets = load_from_bucket(
file="metabolomics_processed_datasets11182019.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/"
)[[1]]
# Covariates to adjust for in the differential analysis.
# NOTE: element order matters — biospec_cols[2] (time to freeze) is
# referenced positionally later in this document.
biospec_cols = c(
  "acute.test.distance",
  "calculated.variables.time_to_freeze",
  "bid" # required for matching datasets
)
# The primary design variables of the differential analysis.
differential_analysis_cols = c(
  "animal.registration.sex",
  "animal.key.timepoint",
  "animal.key.is_control"
)
# Technical / pipeline quality-control covariates.
pipeline_qc_cols = c("sample_order")
# Dataset size overview: side-by-side histograms of the number of
# metabolites (log10) and the number of samples per parsed dataset.
par(mfrow = c(1, 2))
num_features = sapply(metabolomics_parsed_datasets,
                      function(d) nrow(d$sample_data))
hist(log10(num_features), xlab = "Log10 number of metabolites", main = "")
num_samples = sapply(metabolomics_parsed_datasets,
                     function(d) ncol(d$sample_data))
hist(num_samples, xlab = "Number of samples", main = "")
Some sites do not apply the log transformation to their dataset. In this section we plot the coefficient of variation as a function of the mean intensity. We take a single dataset as an example to show how log-transformed data have a reduced mean dependency and smoother plots.
As an additional analysis we also plot the number of missing values per metabolite as a function of its mean intensity. We show that, while there is a high correlation, some missing values appear at fairly high intensities. This is important for imputation, as some sites use a fixed low value instead of KNN imputation.
# Plot cv vs means
library(gplots)
# Example dataset for the CV-vs-mean illustration: an unnamed untargeted
# HILIC-positive white adipose dataset (raw, i.e., not log-transformed).
d = metabolomics_parsed_datasets[["white_adipose_powder,metab_u_hilicpos,unnamed"]]
dx = d$sample_data
# Coefficient of variation of a numeric vector (sd / mean), ignoring NAs.
CoV <- function(x) sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
# Per-metabolite (row) means and CVs on the raw intensities.
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
# Keep metabolites with a defined CV (CoV is NA when all values are NA
# or the mean is NA).
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Raw data")
# Lowess trend line to visualize the mean-CV dependency.
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
# Repeat after log2
# (log2(1+x) to handle zeros)
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Log2 data")
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
# Plot number of NAs vs intensity mean
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
num_nas = rowSums(is.na(dx))
# NOTE: `inds` is reused from the log2 section above; this is valid only
# because dx (and hence dmeans/CoVs) is recomputed identically here.
df = data.frame(Num_NAs = num_nas[inds],Mean_intensity = dmeans[inds])
rho = cor(df$Num_NAs,df$Mean_intensity,method="spearman")
rho = format(rho,digits=2)
plot(Num_NAs~Mean_intensity,df,cex=0.5,pch=20,
main=paste("Spearman:",rho))
# Map each processed dataset to its site (lower-case, spaces -> underscores).
dataset2site = sapply(metabolomics_processed_datasets,
                      function(d) tolower(unique(d$sample_meta$site)))
dataset2site = gsub(" ", "_", dataset2site)
# Simplify dataset names by stripping platform/tissue decorations.
simplified = names(metabolomics_processed_datasets)
for (tag in c("metab_t_", "metab_u_", "_powder", ",named")) {
  simplified = gsub(tag, "", simplified)
}
# Append the site name to targeted (i.e., non-"untargeted") datasets only.
targeted = !grepl("untargeted", simplified)
simplified[targeted] = paste(simplified[targeted],
                             dataset2site[targeted], sep = ",")
names(metabolomics_processed_datasets) = simplified
Correct the meta-data for the differential analysis:
# Restrict each dataset's parsed metadata to the adjustment and
# differential-analysis columns, then drop any column with >= 10% NAs.
for (datasetname in names(metabolomics_processed_datasets)) {
  curr_data = metabolomics_processed_datasets[[datasetname]]$sample_data
  meta = metabolomics_processed_datasets[[datasetname]]$sample_meta_parsed
  # keep only the columns of interest
  meta = meta[, union(biospec_cols, differential_analysis_cols)]
  # drop metadata variables whose fraction of missing values is too high
  na_fraction = colSums(is.na(meta)) / nrow(meta)
  meta = meta[, na_fraction < 0.1]
  metabolomics_processed_datasets[[datasetname]]$sample_meta_parsed = meta
}
Untargeted data are typically log-transformed and analyzed using linear models. On the other hand, concentration data are sometimes analyzed with the same type of models but using the original data. This raises a problem if we wish to compare exact statistics from these data. In this section we perform residual analysis for single metabolites. Our goal is to identify if concentration data behaves “normally” when not log-transformed. The analysis below examines the residuals of the data after fitting linear models for each metabolite, adjusting for freeze time and sex. We then compare the results with and without the log-transformation, counting the number of metabolites with a significant evidence for non-normally distributed residuals.
# check for normality using the Kolmogorov-Smirnov test
# Returns the KS p-value of v against a normal fitted by its own
# mean/sd (small p-value = evidence of non-normal residuals).
# NOTE(review): `samp` is unused — presumably intended for subsampling
# large vectors; confirm before relying on it.
is_normal_test <- function(v, samp = 10000) {
  ks.test(v, "pnorm", mean(v, na.rm = TRUE), sd(v, na.rm = TRUE))$p.value
}
# go over the named datasets, get a logged and an unlogged version of
# the data, use these as inputs for the regression
residual_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
# Targeted (named) datasets only; processed data are stored in log2 scale,
# so 2^x recovers the unlogged concentrations.
if(grepl("untargeted",nn1)){next}
x_log = metabolomics_processed_datasets[[nn1]]$sample_data
x_unlog = 2^x_log
# take the covariates, ignore distances
# (biospec_cols[2] is the time-to-freeze column; intersect() keeps it
# only if it survived the NA-based metadata filtering above)
x_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
curr_covs = x_meta[,intersect(colnames(x_meta),biospec_cols[2])]
curr_covs = data.frame(curr_covs,
sex=x_meta$animal.registration.sex)
# get the lm objects
# For each time point fit the project's per-metabolite differential
# abundance model (pass1a_simple_differential_abundance, sourced above)
# and keep the fitted lm object (return_model=T) for residual analysis.
curr_models = list()
for(tp in unique(x_meta$animal.key.timepoint)){
res_log = apply(
x_log,1,
pass1a_simple_differential_abundance,
tps = x_meta$animal.key.timepoint,tp=tp,
is_control = x_meta$animal.key.is_control,
covs = curr_covs,return_model=T
)
res_unlog = apply(
x_unlog,1,
pass1a_simple_differential_abundance,
tps = x_meta$animal.key.timepoint,tp=tp,
is_control = x_meta$animal.key.is_control,
covs = curr_covs,return_model=T
)
# Per-metabolite KS normality p-values of the model residuals,
# logged vs unlogged input.
is_norm = cbind(
sapply(res_log,function(x)is_normal_test(residuals(x))),
sapply(res_unlog,function(x)is_normal_test(residuals(x)))
)
colnames(is_norm) = c("log","not log")
curr_models[[as.character(tp)]] = is_norm
}
residual_analysis_results[[nn1]] = curr_models
}
# Is there a significant difference between the two options?
# Paired one-sided Wilcoxon per dataset/time point: are the normality
# p-values of the logged fits greater (i.e., residuals more normal)
# than those of the unlogged fits?
log_vs_unlog_summ_mat = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)
wilcox.test(y[,1],y[,2],paired = T,alternative = "g")$p.value))
# Count the number of non-normal metabolites
# (KS p < 0.05) for the logged fits; columns ordered by dataset name
# so the two matrices below are comparable.
num_nonnormal_log = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)sum(y[,1]<0.05)))
num_nonnormal_log =
num_nonnormal_log[,order(colnames(num_nonnormal_log))]
num_nonnormal_unlog = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)sum(y[,2]<0.05)))
num_nonnormal_unlog =
num_nonnormal_unlog[,order(colnames(num_nonnormal_unlog))]
library(corrplot)
par(mar = c(5,5,5,10))
# Visualize the per-dataset difference in non-normal counts:
# negative cells mean logging reduced the number of non-normal metabolites.
corrplot(t(num_nonnormal_log)- t(num_nonnormal_unlog),
is.corr = F,tl.cex = 0.7)
Compare overlaps, effect sizes, and correlations within tissues. Compare targeted-untargeted pairs only. For differential analysis we use the same model as in the analysis above.
# helper function to transform a metabolomics matrix
# to that of its motrpac compound ids
# x:            numeric matrix, rows = metabolites (site-specific names)
# row_annot_x:  annotation table; one column holds the site-specific names
#               and $motrpac_comp_name holds the harmonized names
# Returns x restricted to annotated rows, renamed to motrpac compound names.
extract_metab_data_from_row_annot<-function(x,row_annot_x){
# get the column that best matches the row names of x
int_sizes = apply(row_annot_x,2,function(col,ids)length(intersect(col,ids)),ids=rownames(x))
ind = which(int_sizes==max(int_sizes,na.rm = T))[1]
# BUG FIX: base R's is.element() signature is is.element(el, table); the
# original call used a non-existent `set=` argument, which errors (unless
# is.element was masked by a sourced helper). Use positional arguments.
row_annot_x = row_annot_x[is.element(row_annot_x[,ind],rownames(x)), ,drop=FALSE]
rownames(row_annot_x) = row_annot_x[,ind]
shared = intersect(rownames(row_annot_x),rownames(x))
# drop=FALSE keeps matrix/data.frame structure when only one metabolite
# is shared (otherwise x collapses to a vector and rownames<- fails).
x = x[shared, ,drop=FALSE]
row_annot_x = row_annot_x[shared, ,drop=FALSE]
rownames(x) = row_annot_x$motrpac_comp_name
return(x)
}
# Accumulators: per targeted dataset, the correlation matrices against each
# overlapping untargeted dataset; a long matrix of differential-analysis
# statistics; and the union of metabolites covered by untargeted data.
single_metabolite_corrs = list()
single_metabolite_de = c()
named2covered_shared_metabolites = list()
for(nn1 in names(metabolomics_processed_datasets)){
# nn1 iterates over targeted datasets; tissue is the first comma field.
nn1_tissue = strsplit(nn1,split=",")[[1]][1]
nn1_tissue = gsub("_powder","",nn1_tissue)
if(grepl("untargeted",nn1)){next}
single_metabolite_corrs[[nn1]] = list()
named2covered_shared_metabolites[[nn1]] = NULL
for(nn2 in names(metabolomics_processed_datasets)){
# nn2 iterates over untargeted datasets of the same tissue.
if(nn2 == nn1){next}
if(!grepl("untargeted",nn2)){next}
nn2_tissue = strsplit(nn2,split=",")[[1]][1]
nn2_tissue = gsub("_powder","",nn2_tissue)
nn2_dataset = strsplit(nn2,split=",")[[1]][2]
if(nn1_tissue!=nn2_tissue){next}
# get the numeric datasets and their annotation
x = metabolomics_processed_datasets[[nn1]]$sample_data
y = metabolomics_processed_datasets[[nn2]]$sample_data
row_annot_x = metabolomics_processed_datasets[[nn1]]$row_annot
row_annot_y = metabolomics_processed_datasets[[nn2]]$row_annot
# transform metabolite names to the motrpac comp name
x = extract_metab_data_from_row_annot(x,row_annot_x)
y = extract_metab_data_from_row_annot(y,row_annot_y)
# align the sample sets
# (sample columns are vial labels; map them to animal bio ids)
bid_y = merged_dmaqc_data[colnames(y),"bid"]
bid_x = merged_dmaqc_data[colnames(x),"bid"]
# step 1: merge samples from the same BID
if(length(unique(bid_x))!=length(bid_x)){
x = aggregate_repeated_samples(x,bid_x)
}
else{
colnames(x) = bid_x
}
if(length(unique(bid_y))!=length(bid_y)){
y = aggregate_repeated_samples(y,bid_y)
}else{
colnames(y) = bid_y
}
# step 2: use the shared bio ids
shared_bids = as.character(intersect(colnames(y),colnames(x)))
x = as.matrix(x[,shared_bids])
y = as.matrix(y[,shared_bids])
# At this point x and y are over the same BIDs, now we add the metadata
# NOTE(review): y_meta is taken from nn1 (the targeted dataset) even
# though it is used to model both x and y — presumably the phenotype
# metadata is identical per BID across platforms; confirm.
y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
rownames(y_meta) = y_meta$bid
y_meta = y_meta[shared_bids,]
# get the shared matebolites
shared_metabolites = intersect(rownames(x),rownames(y))
shared_metabolites = na.omit(shared_metabolites)
if(length(shared_metabolites)==0){next}
named2covered_shared_metabolites[[nn1]] = union(
named2covered_shared_metabolites[[nn1]],
shared_metabolites
)
# Compute the correlation matrices of the shared metabolites
# (metabolite-by-metabolite Spearman correlations across samples;
# the single-metabolite case needs the vector form of cor)
if(length(shared_metabolites)>1){
corrs =cor(t(x[shared_metabolites,]),
t(y[shared_metabolites,]),method = "spearman")
}
else{
corrs = cor(x[shared_metabolites,],
y[shared_metabolites,],method = "spearman")
}
# take the covariates (ignore distances)
curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
curr_covs = data.frame(y_meta[,curr_cov_cols])
names(curr_covs) = curr_cov_cols
curr_covs$sex = y_meta$animal.registration.sex # add sex
# differential analysis
# Per time point, per shared metabolite, fit the project's model on
# each platform; return_model=F returns the coefficient statistics.
for(tp in unique(y_meta$animal.key.timepoint)){
resx = t(apply(
matrix(x[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F
))
resy = t(apply(
matrix(y[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F
))
# Add dataset information, time point, tissue
# These are important annotations for our summary matrix
# called single_metabolite_de below
added_columns = matrix(cbind(
rep(nn1,length(shared_metabolites)),
rep(nn2,length(shared_metabolites)),
shared_metabolites,
rep(tp,length(shared_metabolites)),
rep(nn1_tissue,length(shared_metabolites))
),nrow=length(shared_metabolites))
resx = cbind(resx,rep(T,nrow(resx)))
colnames(resx)[ncol(resx)] = "is_targeted"
resy = cbind(resy,rep(F,nrow(resy)))
colnames(resy)[ncol(resy)] = "is_targeted"
# Keep the dataset name relevant to each platform (drop the other):
# column 2 (nn2) is dropped for the targeted rows and column 1 (nn1)
# for the untargeted rows, yielding 9 columns in both cases.
if(nrow(resx)>1){
resx = cbind(added_columns[,-2],resx)
resy = cbind(added_columns[,-1],resy)
}
else{
resx = c(added_columns[,-2],resx)
resy = c(added_columns[,-1],resy)
}
single_metabolite_de = rbind(single_metabolite_de,resx)
single_metabolite_de = rbind(single_metabolite_de,resy)
}
single_metabolite_corrs[[nn1]][[nn2]] = corrs
}
}
# Reformat the results for an easier comparison later
# The rbind-accumulated matrix becomes a data frame with typed columns:
# 4 annotation columns (character) and 5 statistics columns (numeric).
single_metabolite_de = data.frame(single_metabolite_de)
names(single_metabolite_de) = c(
"dataset","metabolite","tp","tissue",
"Est","Std","Tstat","Pvalue","is_targeted")
# Statistics were stored as character/factor in the matrix; convert back.
for(col in names(single_metabolite_de)[-c(1:4)]){
single_metabolite_de[[col]] = as.numeric(
as.character(single_metabolite_de[[col]]))
}
for(col in names(single_metabolite_de)[1:4]){
single_metabolite_de[[col]] =
as.character(single_metabolite_de[[col]])
}
# Remove duplications
# Round numerics first (p-values to more digits) so that rows differing
# only by floating-point noise collapse under unique().
rownames(single_metabolite_de) = NULL
for(nn in names(single_metabolite_de)){
ndig = 5
if(grepl("pval",nn,ignore.case = T)){
ndig = 10
}
if(is.numeric(single_metabolite_de[[nn]])){
single_metabolite_de[[nn]] = round(single_metabolite_de[[nn]],digits = ndig)
}
}
single_metabolite_de = unique(single_metabolite_de)
# a helper function for simplifying dataset names: strips the platform
# prefix and the ",untargeted"/",named" suffixes.
simplify_metab_dataset_name <- function(s) {
  for (pattern in c("metab_u_", ",untargeted", ",named")) {
    s = gsub(pattern, "", s)
  }
  s
}
# Simplify the dataset column of the DE table for readable labels below.
single_metabolite_de[,1] = sapply(single_metabolite_de[,1],
simplify_metab_dataset_name)
We next transform the data above into tables that contain data for each combination of metabolite, time point, and tissue. These are then used for different meta-analyses: (1) a simple random effects analysis, (2) random effects with a binary covariate indicating if a dataset is targeted or untargeted, (3) redo the RE model of (1) with the targeted data only, and (4) redo the RE model of (1) with the untargeted data only.
library(metafor)
# For every (tissue, time point, metabolite) combination fit four
# random-effects meta-analyses over the per-dataset effect estimates:
# (1) simple RE, (2) RE with an is_targeted moderator, (3) RE on targeted
# rows only, (4) RE on untargeted rows only.
meta_analysis_stats = list()
for(tissue in unique(single_metabolite_de$tissue)){
for(tp in unique(single_metabolite_de$tp)){
curr_subset = single_metabolite_de[
single_metabolite_de$tissue==tissue &
single_metabolite_de$tp==tp,]
for(metabolite in unique(curr_subset$metabolite)){
curr_met_data = curr_subset[
curr_subset$metabolite==metabolite,]
# rma.* expects sampling variances; Std is the standard error.
curr_met_data$var = curr_met_data$Std^2
re_model1 = NULL;re_model2=NULL
re_model_tar = NULL;re_model_untar = NULL
# Bare try(): models that fail to converge are deliberately left
# NULL and filtered by downstream code (errors print to console).
try({re_model1 = rma.uni(curr_met_data$Est,curr_met_data$var)})
try({re_model2 = rma.mv(curr_met_data$Est,curr_met_data$var,
mods=curr_met_data$is_targeted)})
try({re_model_tar = rma.uni(
curr_met_data[curr_met_data$is_targeted==1,"Est"],
curr_met_data[curr_met_data$is_targeted==1,"var"]
)})
try({re_model_untar = rma.uni(
curr_met_data[curr_met_data$is_targeted==0,"Est"],
curr_met_data[curr_met_data$is_targeted==0,"var"]
)})
# Keyed by "metabolite,tissue,tp" — parsed back by position later.
meta_analysis_stats[[paste(metabolite,tissue,tp,sep=",")]] =
list(curr_met_data=curr_met_data,re_model1=re_model1,
re_model2 = re_model2,re_model_tar=re_model_tar,
re_model_untar = re_model_untar)
}
}
}
## Error in rma.uni(curr_met_data$Est, curr_met_data$var) :
## Fisher scoring algorithm did not converge. See 'help(rma)' for possible remedies.
We now show some plots to summarize the comparison.
We first plot the number and percentage of metabolites in the targeted datasets that are measured in at least one untargeted dataset.
library(ggplot2)
# For each targeted dataset: how many of its metabolites were covered by
# at least one untargeted dataset (count, fraction of total, total size).
dataset2num_metabolites = sapply(metabolomics_processed_datasets,
function(x)nrow(x$sample_data))
named_dataset_coverage = sapply(named2covered_shared_metabolites,length)
named_dataset_coverage = data.frame(
name = names(named_dataset_coverage),
percentage = named_dataset_coverage /
dataset2num_metabolites[names(named_dataset_coverage)],
count = named_dataset_coverage,
total = dataset2num_metabolites[names(named_dataset_coverage)]
)
# add datasets with no coverage
# (targeted datasets that never shared a metabolite with an untargeted one)
all_targeted_datasets = names(metabolomics_processed_datasets)
all_targeted_datasets = all_targeted_datasets[!grepl("untarg",all_targeted_datasets)]
zero_coverage_datasets = setdiff(all_targeted_datasets,
named_dataset_coverage$name)
zero_coverage_datasets = data.frame(
name = zero_coverage_datasets,
percentage = rep(0,length(zero_coverage_datasets)),
count = rep(0,length(zero_coverage_datasets)),
total = dataset2num_metabolites[zero_coverage_datasets]
)
named_dataset_coverage = rbind(named_dataset_coverage,
zero_coverage_datasets)
# Order rows alphabetically for a stable plot.
named_dataset_coverage =
named_dataset_coverage[order(as.character(named_dataset_coverage$name)),]
# Horizontal bar chart of the coverage fraction, annotated with the
# absolute number of covered metabolites per targeted dataset.
print(ggplot(named_dataset_coverage, aes(x=name, y=percentage)) +
geom_bar(stat = "identity",width=0.2) + coord_flip() +
geom_text(data=named_dataset_coverage,
aes(name, percentage+0.05, label=count),
position = position_dodge(width=0.9),
size=4) +
ggtitle("Targeted dataset: coverage by untargeted"))
We examine the average absolute correlation between the platforms (within tissues). Whenever two platforms share more than a single metabolite we plot both the average correlation between the same metabolites and between other metabolites. Adding the average correlation between platforms but with different metabolites is important as it gives some perspective to what a significant correlation is. That is, in many cases below, the average correlation may be greater than expected.
# Next examine the Spearman correlations between platforms
# Summaries of absolute values (helpers for correlation magnitudes).
mean_abs <- function(x, ...) mean(abs(x), ...)
sd_abs <- function(x, ...) sd(abs(x), ...)
# Summarize a (targeted x untargeted) correlation matrix: the diagonal
# holds same-metabolite correlations, the lower off-diagonal entries are
# different-metabolite pairs. Each part is reduced with `func`.
extract_diag_vs_non_diag <- function(corrs, func = mean_abs, ...) {
  # A single shared metabolite yields a scalar; no "other" pairs exist.
  if (length(corrs) == 1) {
    return(c(same = func(corrs, ...), other = NA))
  }
  c(
    same = func(diag(corrs), ...),
    other = func(c(corrs[lower.tri(corrs, diag = FALSE)]), ...)
  )
}
# all pairwise correlations
# Per targeted dataset: bar plots of mean absolute same- vs
# other-metabolite correlations (with SD error bars) against each
# overlapping untargeted dataset, plus one example correlation heatmap.
for(tar_dataset in names(single_metabolite_corrs)){
l = single_metabolite_corrs[[tar_dataset]]
if(length(l)==0){next}
corr_info = as.data.frame(t(sapply(l, extract_diag_vs_non_diag)))
corr_sd = as.data.frame(t(sapply(l, extract_diag_vs_non_diag,func=sd_abs)))
# Shorten the untargeted dataset labels (keep the platform field only).
rownames(corr_info) = sapply(rownames(corr_info),
function(x)strsplit(x,split=",")[[1]][2])
rownames(corr_info) = gsub("metab_u_","",rownames(corr_info))
rownames(corr_sd) = rownames(corr_info)
names(l) = rownames(corr_info)
corr_info$dataset = rownames(corr_info)
corr_sd$dataset = corr_info$dataset
# NOTE(review): melt() is not loaded explicitly here — presumably
# reshape2/gplots brings it in via the sourced tool scripts; confirm.
corr_info = melt(corr_info)
corr_sd = melt(corr_sd)
corr_info$sd = corr_sd$value
print(
ggplot(corr_info, aes(x=dataset, y=value, fill=variable)) +
geom_bar(position=position_dodge(), stat="identity", colour='black') +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd),na.rm=T,
width=.2,position=position_dodge(.9)) +
ggtitle(tar_dataset) + xlab("Untargeted dataset") + ylab("Spearman") +
labs(fill = "Pair type") +
theme(legend.position="top",legend.direction = "horizontal")
)
# Heatmap for the first untargeted partner, when it is a real matrix.
if(!is.null(dim(l[[1]]))){
print(ggcorrplot(l[[1]],tl.cex=9,tl.srt = 45) +
ggplot2::labs(x = names(l)[1], y = tar_dataset) +
ggplot2::theme(
axis.title.x = element_text(angle = 0, color = 'grey20'),
axis.title.y = element_text(angle = 90, color = 'grey20')
)
)
}
}
# summarize by tissue
# Pool the same-/other-metabolite correlation values of all targeted
# datasets of the same tissue into one long (value, dataset, type) table.
tissue2single_metabolite_corrs<-list()
for(tar_dataset in names(single_metabolite_corrs)){
l = single_metabolite_corrs[[tar_dataset]]
if(length(l)==0){next}
tar_tissue = strsplit(tar_dataset,split=",")[[1]][1]
if(! tar_tissue %in% names(tissue2single_metabolite_corrs)){
tissue2single_metabolite_corrs[[tar_tissue]] = c()
}
# Keep the platform field of the untargeted dataset name as the label.
names(l) = sapply(names(l),function(x)strsplit(x,split=",")[[1]][2])
names(l) = gsub("metab_u_","",names(l))
currvals = c()
for(untar_name in names(l)){
m = l[[untar_name]]
# skip scalar / single-metabolite correlation results
if(is.null(dim(m))||nrow(m)<2){next}
diag_vals = diag(m)
non_diag_vals = m[lower.tri(m)]
currvals = rbind(currvals,
cbind(diag_vals,untar_name,"same"))
currvals = rbind(currvals,
cbind(non_diag_vals,untar_name,"other"))
}
tissue2single_metabolite_corrs[[tar_tissue]] = rbind(
tissue2single_metabolite_corrs[[tar_tissue]],
currvals
)
}
# One bar plot per tissue: mean +/- sd of the pooled same- vs
# other-metabolite correlations, per untargeted platform.
for(tissue in names(tissue2single_metabolite_corrs)){
d = tissue2single_metabolite_corrs[[tissue]]
d = as.data.frame(d)
# Values were stored in a character matrix; convert back to numeric.
d[[1]] = as.numeric(as.character(d[[1]]))
means = aggregate(d[[1]],list(dataset=d[[2]],
pair_type = d[[3]]),mean)
sds = aggregate(d[[1]],list(dataset=d[[2]],
pair_type = d[[3]]),sd)
means$sd = sds$x
means$pair_type = as.character(means$pair_type)
# Force "same" before "other" in the legend/bars.
means = means[order(as.character(means$pair_type),decreasing = T),]
means$pair_type = factor(means$pair_type,levels = c("same","other"))
print(
ggplot(means, aes(x=dataset, y=x, fill=pair_type,order=pair_type)) +
geom_bar(position=position_dodge(), stat="identity", colour='black') +
# clamp the error bars into the valid correlation range [0,1]
geom_errorbar(aes(ymin=pmax(0,x-sd), ymax=pmin(1,x+sd)),na.rm=T,
width=.2,position=position_dodge(.9)) +
ylim(0,1) +
ggtitle(tissue) + xlab("Untargeted dataset") + ylab("Spearman") +
labs(fill = "Pair type") +
theme(legend.position="top",legend.direction = "horizontal")
)
}
Here we go into the single metabolites comparison in greater detail (again, within tissues).
We start with a few examples. Here are the results for lactate in plasma.
# Forest plots for lactate in plasma: take all matching meta-analysis
# results, order them by time point, and plot the simple RE model.
lact_res = meta_analysis_stats[
grepl("lact",names(meta_analysis_stats),ignore.case = T) &
grepl("plasma",names(meta_analysis_stats),ignore.case = T)
]
# Keys are "metabolite,tissue,tp"; the third field is the time point.
lact_res_hours = sapply(names(lact_res),
function(x)as.numeric(strsplit(x,split=",")[[1]][3]))
lact_res = lact_res[order(lact_res_hours)]
# FIX: head(...,6) instead of [1:6] — the original produced NA names
# (and a subsequent indexing error) when fewer than 6 results matched.
for(lact_example in head(names(lact_res),6)){
# drop the tissue prefix from the per-dataset labels
curr_labels = gsub("plasma,","",
meta_analysis_stats[[lact_example]][[1]][,1])
forest(meta_analysis_stats[[lact_example]]$re_model1,
slab = curr_labels,
main = lact_example,xlab = "Log fc",
col = "blue",cex = 1.1)
}
We can now check the same analysis for liver:
# Same forest-plot analysis as above, for lactate in liver.
lact_res = meta_analysis_stats[
grepl("lact",names(meta_analysis_stats),ignore.case = T) &
grepl("liver",names(meta_analysis_stats),ignore.case = T)
]
lact_res_hours = sapply(names(lact_res),
function(x)as.numeric(strsplit(x,split=",")[[1]][3]))
lact_res = lact_res[order(lact_res_hours)]
# FIX: head(...,6) instead of [1:6] — the original produced NA names
# (and a subsequent indexing error) when fewer than 6 results matched.
for(lact_example in head(names(lact_res),6)){
curr_labels = gsub("liver_powder,","",
meta_analysis_stats[[lact_example]][[1]][,1])
forest(meta_analysis_stats[[lact_example]]$re_model1,
slab = curr_labels,
main = lact_example,xlab = "Log fc",
col = "blue",cex = 1.1)
}
We now move to a systematic comparison of the datasets. We start with a naive comparison, checking if metabolites are consistently being discovered using a simple \(p=0.001\) threshold for significance.
# Naive comparison
# Per meta-analysis: 2x2 table of (p<thr) x (is_targeted).
thr = 0.001
naive_analysis_tables = lapply(meta_analysis_stats,
function(x)table(x$curr_met_data[,"Pvalue"]<thr,
x$curr_met_data[,"is_targeted"]))
# Keep tables that have both a TRUE and a FALSE significance row.
# NOTE(review): a metabolite significant in ALL datasets also yields a
# one-row table and is excluded here — confirm that this is intended.
table_with_sig_results = naive_analysis_tables[
sapply(naive_analysis_tables,nrow)>1
]
# Fraction (not percent) of analyses with at least one significant hit.
percent_with_sig =
length(table_with_sig_results)/length(naive_analysis_tables)
# For each table: was a significant result seen in untargeted ("0") and
# targeted ("1") datasets? Encoded as "FALSE,TRUE"-style strings.
sig_results = sapply(table_with_sig_results,
function(x)paste(x["TRUE",c("0","1")]>0,collapse=","))
sig_results_counts = table(sig_results)
print("Counting the number of metabolites with p<0.001")
## [1] "Counting the number of metabolites with p<0.001"
print(paste("Significant in targeted only:",sig_results_counts["FALSE,TRUE"]))
## [1] "Significant in targeted only: 59"
print(paste("Significant in untargeted only:",sig_results_counts["TRUE,FALSE"]))
## [1] "Significant in untargeted only: 157"
print(paste("Significant in both:",sig_results_counts["TRUE,TRUE"]))
## [1] "Significant in both: 101"
As a more rigorous analysis, we use the meta-analyses to obtain useful statistics for comparison.
# Extract some useful statistics per analysis
# P-value for the difference between targeted and untargeted
# (the moderator coefficient of re_model2)
targeted_diff_p =
sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])
# P-values - targeted vs. untargeted
pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
# Drop analyses whose untargeted-only model failed (NULL -> length 0).
# NOTE(review): pvals_tar is NOT filtered the same way — if any targeted
# model failed, the vectors below would be misaligned; confirm that all
# re_model_tar fits succeeded in this run.
pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])
significant_in = rep("None",length(pvals_untar))
significant_in[pvals_tar<0.001] = "Targeted"
significant_in[pvals_untar<0.001] = "Untargeted"
significant_in[pvals_tar<0.001 & pvals_untar<0.001] = "Both"
significant_diff = targeted_diff_p<0.001
df = data.frame(
targeted = -log10(pvals_tar),
untargeted = -log10(pvals_untar),
significant_in = significant_in,
significant_diff = significant_diff
)
rho = cor(pvals_tar,pvals_untar,method = "spearman")
rhop = cor.test(pvals_tar,pvals_untar,method = "spearman")$p.value
# Scatter of -log10 p-values: targeted vs untargeted RE models.
print(
ggplot(df, aes(x=targeted, y=untargeted,
shape=significant_diff, color=significant_in)) +
geom_point() +
ggtitle(paste("-log10 p-values, spearman:",format(rho,digits=2),
"(p=",format(rhop,digits=3),")"))
)
print("### Summary of differences in RE models ###")
## [1] "### Summary of differences in RE models ###"
print("Model is significant at p<0.001:")
## [1] "Model is significant at p<0.001:"
print(table(df$significant_in))
##
## Both None Targeted Untargeted
## 205 2086 123 239
print("Adding is_targeted as a covariate has p<0.001:")
## [1] "Adding is_targeted as a covariate has p<0.001:"
print(table(df$significant_diff))
##
## FALSE TRUE
## 2433 220
# Betas - targeted vs. untargeted
# Compare the RE effect-size estimates (intercept beta) per platform.
betas_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$beta[1,1])
betas_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$beta[1,1])
# Drop failed untargeted fits (same caveat as for the p-values above).
betas_untar = unlist(betas_untar[sapply(betas_untar,length)>0])
df = data.frame(
targeted = betas_tar,
untargeted = betas_untar,
significant_in = significant_in,
significant_diff = significant_diff
)
rho = cor(betas_untar,betas_tar,method = "spearman")
rhop = cor.test(betas_untar,betas_tar,method = "spearman")$p.value
print(
ggplot(df, aes(x=targeted, y=untargeted,
shape=significant_diff, color=significant_in)) +
geom_point() +
ggtitle(paste("Effect sizes, spearman:",format(rho,digits=2),
"(p=",format(rhop,digits=3),")"))
)
From the plots above we take the most extreme examples and examine their forest plots.
# Randomly pick one analysis where both platforms are highly significant
# and the targeted/untargeted moderator shows no difference.
# NOTE(review): result is stochastic (sample without a seed).
agree_example = names(sample(which(pvals_tar< 1e-10 & pvals_untar < 1e-10 &
targeted_diff_p > 0.01))[1])
# Shorten forest-plot labels: drop the ",untargeted" suffix and the
# leading tissue prefix (taken from the first label's first comma field).
simplify_labels_for_forest <- function(s) {
  cleaned = gsub(",untargeted", "", s)
  tissue_prefix = strsplit(cleaned, split = ",")[[1]][1]
  gsub(paste(tissue_prefix, ",", sep = ""), "", cleaned)
}
# Forest plot of the agreement example selected above.
forest(meta_analysis_stats[[agree_example]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[agree_example]][[1]][,1]),
main = paste(agree_example,"significant in both, tar and untar agree",sep="\n"),
xlab = "Log fc",col = "blue")
# Example: both platforms significant but the moderator indicates the
# effect sizes disagree. NOTE(review): stochastic selection (no seed).
agree_p_disagree_beta = names(sample(which(pvals_tar< 1e-10 & pvals_untar < 1e-10 &
targeted_diff_p < 0.001))[1])
forest(meta_analysis_stats[[agree_p_disagree_beta]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[agree_p_disagree_beta]][[1]][,1]),
main = paste(agree_p_disagree_beta,
"significant in both, tar and untar disagree",sep="\n"),
xlab = "Log fc",col = "blue")
# Example: significant in targeted only.
disagree_example1 = names(sample(which(pvals_tar< 1e-20 & pvals_untar >0.1))[1])
forest(meta_analysis_stats[[disagree_example1]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[disagree_example1]][[1]][,1]),
main = paste(disagree_example1,
"significant targeted, tar and untar disagree",sep="\n"),
xlab = "Log fc",col = "blue")
# Example: significant in untargeted only.
disagree_example2 = names(sample(which(pvals_tar > 0.1 & pvals_untar < 1e-20))[1])
forest(meta_analysis_stats[[disagree_example2]]$re_model1,
slab = simplify_labels_for_forest(
meta_analysis_stats[[disagree_example2]][[1]][,1]),
main = paste(disagree_example2,
"significant in untargeted, tar and untar disagree",sep="\n"),
xlab = "Log fc",col = "blue")
Use 5-fold cross validation for analysis within tissues. For each pair of targeted and untargeted datasets from the same tissue, we use the untargeted data as the predictive features and all metabolites in the targeted datasets as the dependent variables. The code below uses feature selection and random forests to train the predictive models.
nfolds = 5
# Cross-platform prediction: for each (targeted, untargeted) pair of the
# same tissue, use the untargeted features (+ covariates) to predict each
# targeted metabolite with feature selection + random forests, 5-fold CV.
prediction_analysis_results = list()
naive_cov_prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
nn1_tissue = strsplit(nn1,split=",")[[1]][1]
nn1_tissue = gsub("_powder","",nn1_tissue)
if(grepl("untargeted",nn1)){next}
for(nn2 in names(metabolomics_processed_datasets)){
if(nn2 == nn1){next}
if(!grepl("untargeted",nn2)){next}
nn2_tissue = strsplit(nn2,split=",")[[1]][1]
nn2_tissue = gsub("_powder","",nn2_tissue)
nn2_dataset = strsplit(nn2,split=",")[[1]][2]
if(nn1_tissue!=nn2_tissue){next}
print(paste("features from:",nn2))
print(paste("labels from:",nn1))
# NOTE(review): stochastic subsampling of pairs — results vary across
# runs unless a seed is set.
if(runif(1)>0.3){next} # just to get results quickly for a few pairs
currname = paste(nn1,nn2,sep=";")
if(currname %in% names(prediction_analysis_results)){next}
# get the numeric datasets and their annotation
y = metabolomics_processed_datasets[[nn1]]$sample_data
x = metabolomics_processed_datasets[[nn2]]$sample_data
# align the sample sets
bid_y = merged_dmaqc_data[colnames(y),"bid"]
bid_x = merged_dmaqc_data[colnames(x),"bid"]
# step 1: merge samples from the same BID
if(length(unique(bid_x))!=length(bid_x)){
x = aggregate_repeated_samples(x,bid_x)
}
else{
colnames(x) = bid_x
}
if(length(unique(bid_y))!=length(bid_y)){
y = aggregate_repeated_samples(y,bid_y)
}else{
colnames(y) = bid_y
}
# step 2: use the shared bio ids
shared_bids = as.character(intersect(colnames(y),colnames(x)))
# transpose: samples become rows (observations), metabolites columns
x = t(as.matrix(x[,shared_bids]))
y = t(as.matrix(y[,shared_bids]))
# At this point x and y are over the same BIDs, now we add the metadata
y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
rownames(y_meta) = y_meta$bid
y_meta = y_meta[shared_bids,]
# take the covariates (ignore distances)
curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
curr_covs = data.frame(y_meta[,curr_cov_cols])
names(curr_covs) = curr_cov_cols
curr_covs$sex = y_meta$animal.registration.sex # add sex
# add the covariates into x
x = cbind(x,curr_covs)
# Run the regressions
# Random fold assignment over the samples (rows of x).
folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
numFeatures = min(ncol(x),1000)
preds = c();real=c()
for(i in 1:ncol(y)){
if( i %% 10 == 0){print(paste("analyzing metabolite number:",i))}
y_i = y[,i]
i_preds = c();i_real=c()
for(j in 1:nfolds){
tr_x = x[folds!=j,]
tr_yi = y_i[folds!=j]
te_x = x[folds==j,]
te_y = y_i[folds==j]
# random forest
# model = randomForest(tr_yi,x=tr_x,ntree = 20)
# te_preds = predict(model,newdata = te_x)
# feature_selection_wrapper (sourced from prediction_ml_tools.R):
# selects topK features by coeff_of_var, then fits randomForest.
model = feature_selection_wrapper(tr_x,tr_yi,
coeff_of_var,randomForest,
topK = numFeatures,ntree=100)
te_preds = predict(model,newdata = te_x)
i_preds = c(i_preds,te_preds)
i_real = c(i_real,te_y)
}
preds = cbind(preds,i_preds)
real = cbind(real,i_real)
}
colnames(preds) = colnames(y)
colnames(real) = colnames(y)
prediction_analysis_results[[currname]] = list(
preds = preds,real=real
)
}
# Checkpoint after each targeted dataset (save is inside the outer loop,
# presumably intentional so long runs can be resumed — confirm).
save_to_bucket(prediction_analysis_results,naive_cov_prediction_analysis_results,
file="tar_vs_untar_prediction_analysis_results.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
}
# Baseline predictor: predict each targeted metabolite from the
# covariates alone (time to freeze + sex) with random forests, 5-fold CV.
# Serves as a reference for the untargeted-feature models above.
naive_cov_prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
if(grepl("untargeted",nn1)){next}
print(nn1)
# get the numeric datasets and their annotation
y = metabolomics_processed_datasets[[nn1]]$sample_data
bid_y = merged_dmaqc_data[colnames(y),"bid"]
if(length(unique(bid_y))!=length(bid_y)){
y = aggregate_repeated_samples(y,bid_y)
}else{
colnames(y) = bid_y
}
y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
rownames(y_meta) = y_meta$bid
y_meta = y_meta[as.character(colnames(y)),]
# transpose: samples become rows, metabolites columns
y = t(as.matrix(y))
# take the covariates (ignore distances)
curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
curr_covs = data.frame(y_meta[,curr_cov_cols])
names(curr_covs) = curr_cov_cols
curr_covs$sex = y_meta$animal.registration.sex # add sex
# Run the regressions
# BUG FIX: the original computed folds (and an unused numFeatures) from
# `x`, a stale object left over from the previous loop, which could
# mismatch this dataset's sample count. Folds must cover the rows of y.
folds = sample(rep(1:nfolds,(1+nrow(y)/nfolds)))[1:nrow(y)]
preds = c();real=c()
for(i in 1:ncol(y)){
if( i %% 10 == 0){print(paste("analyzing metabolite number:",i))}
y_i = y[,i]
i_preds = c();i_real=c()
for(j in 1:nfolds){
tr_x = curr_covs[folds!=j,]
tr_yi = y_i[folds!=j]
te_x = curr_covs[folds==j,]
te_y = y_i[folds==j]
# random forest on the covariates only
model = randomForest(tr_yi,x=tr_x,ntree = 20)
te_preds = predict(model,newdata = te_x)
i_preds = c(i_preds,te_preds)
i_real = c(i_real,te_y)
}
preds = cbind(preds,i_preds)
real = cbind(real,i_real)
}
colnames(preds) = colnames(y)
colnames(real) = colnames(y)
naive_cov_prediction_analysis_results[[nn1]] = list(
preds = preds,real=real
)
}
save_to_bucket(prediction_analysis_results, naive_cov_prediction_analysis_results,
file="tar_vs_untar_prediction_analysis_results.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
We now take the predicted and real values and estimate the prediction accuracy in different ways.
# Reload the saved paired prediction results, merge them with the naive
# covariate-only results, and compute per-metabolite accuracy metrics
# (Spearman rho, MSE/RMSE, variance-relative MSE, CoV) per tissue/dataset.
prediction_analysis_results = load_from_bucket(
  "tar_vs_untar_prediction_analysis_results.RData",
  "gs://bic_data_analysis/pass1a/metabolomics/",F)
prediction_analysis_results = prediction_analysis_results[[1]]
all_pred_results = c(prediction_analysis_results,naive_cov_prediction_analysis_results)
results_metrics = list()
for(nn in names(all_pred_results)){
  curr_result = all_pred_results[[nn]]
  preds = curr_result$preds
  real = curr_result$real
  # paired analyses are named "targeted;untargeted"; naive ones have no ";"
  nn_parts = strsplit(nn,split=";")[[1]]
  tar_name = nn_parts[1]
  untar_name = nn_parts[2]
  # metabolite names come from the targeted dataset's feature rows
  # (assumes rows of sample_data are metabolites -- consistent with use below)
  y = metabolomics_processed_datasets[[tar_name]]$sample_data
  colnames(preds) = rownames(y)
  colnames(real) = rownames(y)
  tar_name = simplify_metab_dataset_name(tar_name)
  untar_name = simplify_metab_dataset_name(untar_name)
  currtissue = strsplit(tar_name,split=",")[[1]][1]
  tissue_prefix = paste0(currtissue,",")
  tar_name = gsub(tissue_prefix,"",tar_name)
  untar_name = gsub(tissue_prefix,"",untar_name)
  # covariate-only baselines: reuse the targeted name as the "untargeted" key
  if(is.na(untar_name)){untar_name = tar_name}
  if(is.null(results_metrics[[currtissue]])){
    results_metrics[[currtissue]] = list()
  }
  if(is.null(results_metrics[[currtissue]][[tar_name]])){
    results_metrics[[currtissue]][[tar_name]] = list()
  }
  # per-metabolite Spearman correlation, trimmed to 3 significant digits
  rhos = as.numeric(format(diag(cor(preds,real,method="spearman")),digits=3))
  sum_sq_err = colSums((preds-real)^2)
  MSEs = sum_sq_err / nrow(preds)
  RMSE = sqrt(MSEs)
  rMSE = MSEs / apply(y,1,var) # MSE relative to the metabolite's variance
  CoVs = apply(y,1,sd) / apply(y,1,mean) # coefficient of variation
  discCoVs = cut(CoVs,breaks = 2,ordered_result = T) # low/high CoV bins
  results_metrics[[currtissue]][[tar_name]][[untar_name]] = data.frame(
    rhos,MSEs,RMSE,rMSE,CoVs,discCoVs
  )
}
We now present a few summary plots.
# Summary plots: for each tissue, box-plot the per-metabolite Spearman
# correlations, comparing each untargeted dataset against the
# covariate-only baseline, and tabulate how many metabolites exceed rho>0.7.
for(tissue in names(results_metrics)){
  tissue_summary = c()
  for(tar in names(results_metrics[[tissue]])){
    l = results_metrics[[tissue]][[tar]]
    rho_vs_cv = c()
    for(untar in names(l)){
      m = l[[untar]][,c("rhos","discCoVs")] # take the current matrix
      untar_name = untar
      if(untar == tar){
        # untar==tar marks the naive covariate-only baseline
        untar_name = "covariates" # BUG FIX: label was misspelled "covarites"
      }
      m = cbind(rep(untar_name,nrow(m)),m)
      m$discCoVs = as.numeric(m$discCoVs) # ordered factor -> integer level codes
      rho_vs_cv = rbind(rho_vs_cv,m)
    }
    colnames(rho_vs_cv)[1] = "dataset"
    tissue_summary = rbind(tissue_summary,rho_vs_cv)
  }
  boxplot(rhos~dataset,data=tissue_summary,las=2,
    ylab="Spearman",xlab = "",ylim=c(0,1),
    main = tissue) # FIX: paste(tissue,sep=",") was a single-arg no-op
  print(table(tissue_summary$dataset,tissue_summary$rhos>0.7))
}
As additional references, we train below additional models. First, we check the prediction of naive models that use technical and clinical covariates only. Second, we use multi-task regression and deep learning models.
# Reference baseline: predict each targeted metabolite from technical and
# clinical covariates only (sex, weight, running distance, timepoint),
# using nfolds cross-validated random forests.
cov_prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  if(grepl("untargeted",nn1)){next}
  print(nn1)
  y = metabolomics_processed_datasets[[nn1]]$sample_data
  y_vials = colnames(y) # keep vial labels to index the dmaqc covariates
  bid_y = merged_dmaqc_data[colnames(y),"bid"]
  colnames(y) = bid_y
  y = t(as.matrix(y)) # rows: samples, columns: metabolites
  if(ncol(y)>1000){next} # skip very large panels
  cov_cols = c("animal.registration.sex",
    "acute.test.weight",
    "acute.test.distance",
    "animal.key.timepoint")
  covs = merged_dmaqc_data[y_vials,cov_cols]
  x = covs
  # Run the regressions
  folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
  preds = c();real=c()
  for(i in 1:ncol(y)){
    # BUG FIX: was y[,1] -- every metabolite was modeled on the first
    # column, so all results were copies of metabolite 1.
    y_i = y[,i]
    i_preds = c();i_real=c()
    for(j in 1:nfolds){
      tr_x = x[folds!=j,]
      tr_yi = y_i[folds!=j]
      te_x = x[folds==j,]
      te_y = y_i[folds==j]
      # random forest on covariates only
      model = randomForest(tr_yi,x=tr_x,ntree = 20)
      te_preds = predict(model,newdata = te_x)
      i_preds = c(i_preds,te_preds)
      i_real = c(i_real,te_y)
    }
    preds = cbind(preds,i_preds)
    real = cbind(real,i_real)
  }
  # name result columns by metabolite, consistent with the analyses above
  colnames(preds) = colnames(y)
  colnames(real) = colnames(y)
  cov_prediction_analysis_results[[nn1]] = list(
    preds = preds,real=real
  )
}
# preds = c();real=c()
# for(j in 1:nfolds){
# tr_x = x[folds!=j,]
# tr_y = y[folds!=j,]
# te_x = x[folds==j,]
# te_y = y[folds==j,]
# model = MTL_wrapper(tr_x,tr_y,type="Regression", Regularization="L21")
# te_preds = predict(model,te_x)
# real = rbind(real,te_y)
# preds = rbind(preds,te_preds)
# }
# diag(cor(preds,real))
# Using PLS regression
# library(pls)
# pls_model = plsr(y~x,ncomp = 5,validation="LOO")
# eval = MSEP(pls_model)
#
# y_pca = prcomp(y)
# plot(y_pca)
# explained_var = y_pca$sdev^2/sum(y_pca$sdev^2)
# y_pca_matrix = y_pca$x[,1:10]
#
# # regress out sex, weight
#
# get_explained_variance_using_PCA(x,y)
# x = apply(x,2,regress_out,covs=covs)
# y = apply(y,2,regress_out,covs=covs)
# get_explained_variance_using_PCA(x,y)